# Load inputs: class labels, TPM expression matrix, clinical covariates, and
# the gene-selection index saved from the PAMR analysis.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # run relative to this script's folder
labels <- read.csv("labels.csv")
features <- read.csv("tpm_expression.csv")
covariates <- read.csv("tcga_pancancer_lung_cov.csv")
sgenes_index <- read.csv("sgenes_index.csv") # saved from PAMR analysis
Adjusting row and column names and handling NAs in the covariates:
# --- labels: factor response + friendly column names ---
labels[, 2] <- factor(labels[, 2]) # factorize once, up front, on the full set
colnames(labels) <- c("ID", "Label")
# --- features: first column holds the gene IDs -> promote to row names ---
row.names(features) <- features[, 1]
features <- features[, -1]
# --- covariates: transpose so samples become rows, then normalize missing codes ---
row.names(covariates) <- covariates[, 1]
covariates <- as.data.frame(t(covariates[, -1]))
covariates[covariates == "[Not Applicable]" | covariates == "[Not Available]"] <- NA
Train/test split:
# Reproducible 50/50 train/test split on the observation indices
set.seed(123)
n_obs <- nrow(labels)
train_index <- sample(n_obs, n_obs * 0.5)
Optional data transformation to bring the data towards normality / stabilise the variance:
# Optional variance-stabilising / normalising transformation of the TPM matrix.
# Leave all flags FALSE --> no transformation; set at most one to TRUE.
log2transf <- FALSE
log10transf <- FALSE
deseqVST <- FALSE
if (log2transf) {
  features <- log2(1 + features) # +1 pseudocount avoids -Inf at zero values
}
if (log10transf) {
  features <- log10(1 + features)
}
if (deseqVST) {
  # BUG FIX: the original branch referenced undefined objects (traindata$x,
  # dds_train, vst_train); the VST pipeline now operates on `features` end to end.
  dds <- DESeq2::DESeqDataSetFromMatrix(countData = features,
                                        colData = data.frame(condition = rep(1, ncol(features))),
                                        design = ~ 1)
  vsd <- DESeq2::vst(dds, blind = TRUE) # blind VST: design is ignored for dispersion
  features <- as.matrix(SummarizedExperiment::assay(vsd))
}
Organizing the data in the same format as in the PAMR analysis, but factor labels are converted to integer values starting from 0, as xgboost requires. Features are all numeric so they are fine as they are, but the feature matrix is transposed to get variables/genes in the columns and observations in the rows. The data is subsetted to the features/genes selected in the PAMR analysis with a threshold of \(Th\); the boolean vector marking the selected genes with TRUE is in sgenes_index$sgenes_indexTh:
# Boolean mask of the genes retained by the PAMR analysis at threshold Th
selected <- sgenes_index$sgenes_indexTh
# xgboost wants 0-based integer labels and observations in rows, so the factor
# labels are recoded and the expression matrix is transposed after subsetting.
traindata <- list(
  ynum   = as.integer(labels[train_index, 2]) - 1,
  y      = labels[train_index, 2],
  x      = t(as.matrix(features[selected, train_index])),
  geneid = rownames(features[selected, ])
)
testdata <- list(
  ynum   = as.integer(labels[-train_index, 2]) - 1,
  y      = labels[-train_index, 2],
  x      = t(as.matrix(features[selected, -train_index])),
  geneid = rownames(features[selected, ])
)
rm(features) # free memory: the full expression matrix is no longer needed
numclass <- length(unique(traindata$ynum))
## [1] "Nr. training obvs: 545"
## [1] "Nr. test obvs: 546"
## [1] "Nr. of features (genes) selected: 24"
The traindata$ynum integers (0,1,2) correspond to the levels (LUAD,LUSC,NORM) of traindata$y.
library(xgboost)
# Multiclass soft-probability boosted-tree model on the PAMR-selected genes
xgb_model <- xgboost(
  data      = traindata$x,
  label     = traindata$ynum,
  nrounds   = 100,
  objective = "multi:softprob",
  num_class = numclass,
  max_depth = 10,
  verbose   = FALSE
)
To produce the silplot visualization, since XGBoost is not natively
supported in classmap, we can use the supplementary
function in classmapExt, vcr.custom.*. To that function we
feed just the given/true y and the extracted posterior
probabilities; this is enough to enable PAC computation and thus the
silplot visual.
Produce the posteriors for train:
# Posterior class probabilities on the training set; reshape = TRUE returns an
# (observations x classes) matrix instead of a flat vector.
# FIX: TRUE instead of T (T is an ordinary variable and can be reassigned).
traindata$posteriors <- predict(xgb_model, traindata$x, reshape = TRUE)
colnames(traindata$posteriors) <- levels(traindata$y) # column order follows the factor levels
# Predicted label = class with the highest posterior (first one wins on ties)
traindata$ypred <- apply(traindata$posteriors, 1,
                         function(p) colnames(traindata$posteriors)[which.max(p)])
Silplot on Train:
# Training-set confusion matrix (confusionMatrix presumably from caret,
# loaded elsewhere — verify); ypred is character, so factorize before comparing
confusionMatrix(factor(traindata$ypred), traindata$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LUAD LUSC NORM
## LUAD 259 0 0
## LUSC 0 238 0
## NORM 0 0 48
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9933, 1)
## No Information Rate : 0.4752
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: LUAD Class: LUSC Class: NORM
## Sensitivity 1.0000 1.0000 1.00000
## Specificity 1.0000 1.0000 1.00000
## Pos Pred Value 1.0000 1.0000 1.00000
## Neg Pred Value 1.0000 1.0000 1.00000
## Prevalence 0.4752 0.4367 0.08807
## Detection Rate 0.4752 0.4367 0.08807
## Detection Prevalence 0.4752 0.4367 0.08807
## Balanced Accuracy 1.0000 1.0000 1.00000
# vcr object built from true labels + posteriors only: enough for PAC and
# hence the silhouette plot (no distances supplied, so no farness/classmap yet)
vcrtrainbase=vcr.custom.train(traindata$y, probs=traindata$posteriors) #feeding only true labels and posteriors
silplot(vcrtrainbase)
## classNumber classLabel classSize classAveSi
## 1 LUAD 259 0.99
## 2 LUSC 238 0.99
## 3 NORM 48 0.99
Produce the posteriors for test set:
# Posterior class probabilities on the held-out test set.
# FIX: TRUE instead of T (T can be reassigned and is unsafe in scripts).
testdata$posteriors <- predict(xgb_model, testdata$x, reshape = TRUE)
colnames(testdata$posteriors) <- levels(traindata$y) # keep the training-level order
# Predicted label = class with the highest posterior (first one wins on ties)
testdata$ypred <- apply(testdata$posteriors, 1,
                        function(p) colnames(testdata$posteriors)[which.max(p)])
Silplot on test:
# Test-set confusion matrix (confusionMatrix presumably from caret,
# loaded elsewhere — verify); factorize the character predictions first
confusionMatrix(factor(testdata$ypred), testdata$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LUAD LUSC NORM
## LUAD 223 11 0
## LUSC 16 241 0
## NORM 3 1 51
##
## Overall Statistics
##
## Accuracy : 0.9432
## 95% CI : (0.9204, 0.9611)
## No Information Rate : 0.4634
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9025
##
## Mcnemar's Test P-Value : 0.1773
##
## Statistics by Class:
##
## Class: LUAD Class: LUSC Class: NORM
## Sensitivity 0.9215 0.9526 1.00000
## Specificity 0.9638 0.9454 0.99192
## Pos Pred Value 0.9530 0.9377 0.92727
## Neg Pred Value 0.9391 0.9585 1.00000
## Prevalence 0.4432 0.4634 0.09341
## Detection Rate 0.4084 0.4414 0.09341
## Detection Prevalence 0.4286 0.4707 0.10073
## Balanced Accuracy 0.9427 0.9490 0.99596
# Test-set vcr object from true labels + posteriors, anchored to the train
# vcr object; again enough for PAC and thus the silhouette plot
vcrtestbase=vcr.custom.newdata(ynew=testdata$y, probs=testdata$posteriors, vcr.custom.train.out=vcrtrainbase)
silplot(vcrtestbase)
## classNumber classLabel classSize classAveSi
## 1 LUAD 242 0.83
## 2 LUSC 253 0.89
## 3 NORM 51 0.98
To enable the classmap plot we should devise a proper way to measure
the distance of each observation \(i\)
to each given class \(g\) (D(\(i\),\(g\))) according to the trained classifier
view on the data. We should produce a “distance to class” matrix for all
observations in the train set and another for the ones in the test set.
The matrices produced will be respectively fed into
vcr.custom.train and vcr.custom.newdata and
will allow for Farness computation.
To produce D(\(i\),\(g\)) for our XGBoost model, which belongs to the large family of tree classifiers, we essentially build upon the same concept that Rousseeuw and Raymaekers established for basic classification tree algorithms (Supplementary Material, section A.3, of “Silhouettes and Quasi Residual Plots for Neural Nets and Tree-based Classifiers”, 2022, by Raymaekers and Rousseeuw).
To do that we start by computing pairwise dissimilarities between the
points in the training set with the gower metric (here
essentially a weighted, range-normalised distance since all features are
numeric), weighted by the average importance of the features in the
classifier as output in the Gain column after calling
xgb.importance on our model:
traindata$importance <- xgb.importance(model = xgb_model)
# Per-gene weight vector aligned with the column order of traindata$x:
# Gain importance for genes the model actually used, 0 for the rest.
# FIX: vectorized named assignment replaces the original row-by-row loop
# over the importance table (same result, no 1:nrow() index loop).
traindata$weight <- setNames(rep(0, ncol(traindata$x)), colnames(traindata$x))
traindata$weight[traindata$importance$Feature] <- traindata$importance$Gain
library(cluster)
# Importance-weighted gower pairwise dissimilarities among training cases
traindata$pwd <- as.matrix(daisy(traindata$x, metric = "gower", weights = traindata$weight))
any(is.na(traindata$pwd)) # check for NAs (could happen if a weight is exactly zero)
## [1] FALSE
# Pairwise dissimilarities among the *test* observations, reusing the
# importance weights learned on the training classifier
# (needed later for the mdscolorscale plot)
testdata$pwd <- as.matrix(daisy(testdata$x, metric = "gower", weights = traindata$weight))
any(is.na(testdata$pwd)) # check for NAs (could happen if a weight is exactly zero)
## [1] FALSE
Now for the training set we compute the ‘distance to class’ matrix. We take, as the distance of a generic observation \(i\) to a generic class \(g\), the median among the \(k=5\) smallest dissimilarities (because of the local nature of the decision space of tree algorithms) between object \(i\) and the set of objects \(j\) actually belonging to class \(g\).
# 'Distance to class' matrix for the training set.
# For each case i and class g, D(i,g) = median of the k smallest
# dissimilarities between i and the training members of class g.
# Compute neighbors by sorting dissimilarities:
sortNgb <- t(apply(traindata$pwd, 1, order))[, -1] # row i: indices of the other cases, nearest first
# NOTE(review): [, -1] drops the first column assuming each case sorts itself
# first (zero self-dissimilarity) — would be off with exact-duplicate rows
sortDis <- t(apply(traindata$pwd, 1, sort))[, -1] # row i: matching dissimilarity values, ascending
k=5 # number of nearest same-class neighbors to aggregate over
yintv=as.numeric(traindata$y) # integer class codes of the training labels
# Pre-allocate the (cases x classes) result matrix:
traindata$distToClass <- matrix(rep(NA, nrow(traindata$x) * numclass), ncol = numclass)
for (i in seq_len(nrow(traindata$x))) { #loop over all cases
for (g in seq_len(numclass)) { #loop over classes
ngbg <- which(yintv[sortNgb[i, ]] == g) # positions (nearest-first order) of neighbors in class g
if (length(ngbg) > k) {ngbg <- ngbg[seq_len(k)]} # keep only the k nearest of them
traindata$distToClass[i, g] <- median(sortDis[i, ngbg]) # median of those dissimilarities
}
}
Now, to compute the ‘distance to class’ matrix for the cases in the test set we should only use the given test case and the information from the train set. So each test case is taken separately and the pairwise distances are calculated on the set composed of the training observations plus the test case considered. Then the distance between the case and a generic class \(g\) is computed exactly as before, that is by taking the median among the k=5 smallest dissimilarities between the case and the set of training objects actually belonging to class \(g\). (Running this chunk may take a while because of the nested loops.)
# 'Distance to class' matrix for the test set: each test case is stacked onto
# the training set, pairwise dissimilarities are recomputed, and D(i,g) is the
# median of the k smallest dissimilarities to training members of class g.
# FIXES vs the original:
#  - testdata$label did not exist (as.numeric(NULL)[1] gave NA labels);
#  - rbind() of a scalar and a vector built a 2-row matrix whose linear
#    indexing scrambled the label/case alignment -> a plain vector is correct,
#    and here we simply keep the training labels and drop the self column;
#  - seq_len(3) hard-coded the class count -> use numclass;
#  - only row 1 of the combined matrix is needed, so the full sort/order over
#    all (n+1) rows is avoided.
testdata$newDistToClass <- matrix(NA_real_, nrow = nrow(testdata$x), ncol = numclass)
ytrain <- as.numeric(traindata$y) # integer class codes of the training cases
for (i in seq_len(nrow(testdata$x))) {
  testtrain <- rbind(testdata$x[i, ], traindata$x) # row 1 = the test case
  testtrainpwd <- as.matrix(daisy(testtrain, metric = "gower", weights = traindata$weight))
  dtest <- testtrainpwd[1, -1] # test case vs every training case (self removed)
  ord <- order(dtest)          # training-case indices, nearest first
  dsort <- dtest[ord]          # matching dissimilarities, ascending
  for (g in seq_len(numclass)) { # loop over classes
    ngbg <- which(ytrain[ord] == g)                # neighbors belonging to class g
    if (length(ngbg) > k) {ngbg <- ngbg[seq_len(k)]} # keep only the k nearest
    testdata$newDistToClass[i, g] <- median(dsort[ngbg]) # median of those k
  }
}
Now it is possible to run vcr.custom.train inputting
also distToClass and allow for farness
computation that in turn allow for classmap plots.
# Rebuild the train vcr object, now also passing distToClasses so that
# farness can be computed, which in turn enables the classmap plots.
vcrtrain=vcr.custom.train(traindata$y, traindata$posteriors, distToClasses = traindata$distToClass)
par(mfrow=c(1,3)) # one classmap per class, side by side
classmap(vcrtrain, whichclass = 1) # class 1 (LUAD, per the factor level order)
classmap(vcrtrain, whichclass = 2) # class 2 (LUSC)
classmap(vcrtrain, whichclass = 3) # class 3 (NORM)
We also previously computed the pairwise dissimilarities in the
trained-xgboost model view. This enables the mdscolorscale
plot:
# MDS colour-scale plot of the training cases, using the importance-weighted
# gower pairwise dissimilarities computed earlier
mdscolorscale(vcrtrain,diss=traindata$pwd)
Now it is possible to run vcr.custom.newdata inputting
also newDistToClass and allow for farness
computation that in turn allow for classmap plots.
# Rebuild the test vcr object, now also passing newDistToClasses so that
# farness can be computed, which in turn enables the classmap plots.
vcrtest=vcr.custom.newdata(ynew = testdata$y , probs = testdata$posteriors , vcr.custom.train.out = vcrtrain, newDistToClasses = testdata$newDistToClass)
par(mfrow=c(1,3)) # one classmap per class, side by side
classmap(vcrtest, whichclass = 1) # class 1 (LUAD, per the factor level order)
classmap(vcrtest, whichclass = 2) # class 2 (LUSC)
classmap(vcrtest, whichclass = 3) # class 3 (NORM)
We also previously computed the pairwise dissimilarities in the
trained-xgboost model view on the whole set of test observations. This
enables the mdscolorscale plot:
# MDS colour-scale plot of the test cases
#with pairwise distance only among test dataset
mdscolorscale(vcrtest, diss=testdata$pwd)